Merge the white mold datasets

Observational, agronomic, soils and weather variables

Author

Denis Shah

Published

Invalid Date

OBJECTIVE

Merge the different sources of variables linked to the field observations:
- the survey data (subject and wm response variable)
- agronomic (drainage, hydrol, cd, harv.optim)
- soils (ph, om, log_sand_clay, log_silt_clay)
- canopy closure at 35 dap
- total rain from planting to 35 dap (pre-bloom)
- total rain from 36-50 dap (bloom through early pod fill)
- vars identified from FDA:
  - t2m_mean: start = 0, end = 4
  - sm: start = -4, end = 3
  - sm: start = 5, end = 15
  - sm: start = 17, end = 24
  - sm: start = 40, end = 49
  - stsm: start = 35, end = 44

Packages

library(tidyverse)
library(kableExtra)

The survey data (subject and wm response variable)

# Load the survey (observational) data; the .RData file restores the object `df`:
load(here::here("Data", "Survey.RData"))  # df

# Response table: one row per field (subject) holding the binary wm status
# recorded at that field's final sampling date.
surv <-
  df %>%
  # Drop the PA fields (Potter county) and any record without coordinates:
  dplyr::filter(county != "Potter", !is.na(latitude), !is.na(longitude)) %>%
  dplyr::select(subject, latitude, longitude, sampling.date, wm) %>%
  dplyr::arrange(subject, sampling.date) %>%
  dplyr::group_by(subject) %>%
  # Keep only the latest sampling date per field (on ties the first row wins):
  dplyr::slice_max(sampling.date, n = 1, with_ties = FALSE) %>%
  dplyr::ungroup() %>%
  # Fields whose final record has no wm value are removed, then wm is
  # dichotomized: any value above zero becomes 1, otherwise 0.
  dplyr::filter(!is.na(wm)) %>%
  dplyr::mutate(wm = as.numeric(wm > 0)) %>%
  dplyr::select(subject, wm)

Agronomic (drainage, hydrol, cd, harv.optim)

# Agronomic covariates, taken from each field's final sampling record.
agron <-
  df %>%
  # Exclude the Potter county fields and records with missing location data:
  dplyr::filter(county != "Potter", !is.na(latitude), !is.na(longitude)) %>%
  dplyr::group_by(subject) %>%
  # Keep the record from the last sampling date of each field:
  dplyr::slice_max(sampling.date, n = 1, with_ties = FALSE) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    # Two-group drainage factor derived from drainage.class:
    drainage = forcats::fct_collapse(
      drainage.class,
      `Poorly_Drained` = c("Somewhat poorly drained", "Poorly drained", "Very poorly drained"),
      `Well_Drained` = c("Somewhat excessively drained", "Well drained", "Moderately well drained")
    ),
    # The dual hydrologic groups (A/D, B/D, C/D) are folded into D (natural condition):
    hydrol = forcats::fct_collapse(
      hydro.group,
      `A` = "A",
      `B` = "B",
      `C` = "C",
      `D` = c("D", "A/D", "B/D", "C/D")
    ),
    # Fields sampled later than 60 dap are treated as past the optimal harvest
    # time: "Yes" means dap <= 60, "No" means dap > 60.
    harv.optim = factor(dap > 60, levels = c(FALSE, TRUE), labels = c("Yes", "No"))
  ) %>%
  # Retain only vars with no missing values and without sparsely populated categories:
  dplyr::select(subject, drainage, hydrol, cd, harv.optim) %>%
  # Drop any duplicated rows:
  dplyr::distinct()

Soils (ph, om, log_sand_clay, log_silt_clay)

# Soil covariates per field: ph, om and the two texture log-ratios.
soils <-
  read.csv(here::here("Data", "extracted_soil_data.csv")) %>%
  dplyr::select(-longitude, -latitude) %>%
  # Rescale clay, sand and silt by a common factor so they sum to 100 while
  # keeping their proportions to one another:
  dplyr::mutate(
    scaling_factor = 100 / (clay + sand + silt),
    dplyr::across(c(clay, sand, silt), ~ .x * scaling_factor)
  ) %>%
  # Sand, silt and clay are compositional, so express them as log ratios
  # with clay as the reference component:
  dplyr::mutate(
    log_sand_clay = log(sand / clay),
    log_silt_clay = log(silt / clay)
  ) %>%
  dplyr::select(subject, ph, om, log_sand_clay, log_silt_clay)

Canopy closure at 35 dap

# Restore the canopy closure object `cc.df.35dap` from its .RData file;
# it is merged with the other sources by `subject` further down.
load(here::here("CanopyClosure", "cc.df.35dap.RData")) # cc.df.35dap

Rain

  • total rain from planting to 35 dap (pre-bloom)
  • total rain from 36-50 dap (bloom through early pod fill)
# Restore the object `rain_vars` from its .RData file; it is merged by `subject`
# further down (presumably supplies rainto35dap and rain36to50dap -- confirm).
load(here::here("Openmeteo", "rain_vars.RData")) # rain_vars

Other environmental variables

  • vars identified from FDA:
    • t2m_mean: start = 0, end = 4
    • sm: start = -4, end = 3
    • sm: start = 5, end = 15
    • sm: start = 17, end = 24
    • sm: start = 40, end = 49
    • stsm: start = 35, end = 44
# Restore the object `weather.vars` from its .RData file; it is merged by
# `subject` further down (presumably holds the windowed t2m_mean / sm / stsm
# columns that are renamed after the join -- confirm).
load(here::here("FunctionalDataAnalysis", "FunctiononScalar", "WeatherVars.RData"))  # weather.vars

Join em up!

# Merge every source onto the survey table, keyed by `subject`.
# Successive left joins keep exactly the fields present in `surv`.
# relationship = "one-to-one" makes a join stop with an error if a source holds
# more than one row for a subject, rather than silently duplicating that field's
# rows in X (which would distort any model fitted to the fused data).
X <- purrr::reduce(
  list(surv, agron, soils, cc.df.35dap, rain_vars, weather.vars),
  function(merged, src) {
    dplyr::left_join(merged, src, by = "subject", relationship = "one-to-one")
  }
) %>%
  # Rename vars to avoid things like a minus sign in a var name (avoids possible
  # problems later on) and to give more descriptive names.
  # dbp = days before planting, dap = days after planting
  dplyr::rename(
    t2m_mean_to_4dap = `t2m_mean_0_4`,
    sm_4dbp_to_3dap = `sm_-4_3`,
    sm_5dap_to_15dap = `sm_5_15`,
    sm_17dap_to_24dap = `sm_17_24`,
    sm_40dap_to_49dap = `sm_40_49`,
    stsm_35dap_to_44dap = `stsm_35_44`
  )

# Sanity check of the merged data: per-column summaries (ranges, factor counts).
summary(X)
    subject          wm                  drainage   hydrol              cd     
 Min.   :  1   Min.   :0.000   Well_Drained  :266   A: 54   Central Lakes: 93  
 1st Qu.: 98   1st Qu.:0.000   Poorly_Drained: 90   B: 15   Great Lakes  :263  
 Median :200   Median :0.000                        C: 56                      
 Mean   :218   Mean   :0.205                        D:231                      
 3rd Qu.:349   3rd Qu.:0.000                                                   
 Max.   :440   Max.   :1.000                                                   
 harv.optim       ph             om        log_sand_clay   log_silt_clay   
 Yes:270    Min.   :4.72   Min.   :0.179   Min.   :-0.99   Min.   :-0.147  
 No : 86    1st Qu.:5.86   1st Qu.:0.491   1st Qu.: 0.28   1st Qu.: 1.033  
            Median :6.01   Median :0.523   Median : 0.74   Median : 1.102  
            Mean   :5.96   Mean   :0.529   Mean   : 0.74   Mean   : 1.100  
            3rd Qu.:6.13   3rd Qu.:0.549   3rd Qu.: 0.99   3rd Qu.: 1.170  
            Max.   :6.45   Max.   :1.545   Max.   : 3.41   Max.   : 1.587  
      cc35        rainto35dap    rain36to50dap   t2m_mean_to_4dap
 Min.   : 24.5   Min.   : 36.0   Min.   :  8.0   Min.   :10.7    
 1st Qu.: 44.0   1st Qu.: 79.9   1st Qu.: 25.2   1st Qu.:18.1    
 Median : 50.6   Median :114.4   Median : 46.6   Median :20.4    
 Mean   : 51.5   Mean   :122.2   Mean   : 53.5   Mean   :19.8    
 3rd Qu.: 55.0   3rd Qu.:161.2   3rd Qu.: 78.1   3rd Qu.:21.5    
 Max.   :124.3   Max.   :241.7   Max.   :138.9   Max.   :25.5    
 sm_4dbp_to_3dap sm_5dap_to_15dap sm_17dap_to_24dap sm_40dap_to_49dap
 Min.   :0.098   Min.   :0.078    Min.   :0.073     Min.   :0.070    
 1st Qu.:0.264   1st Qu.:0.266    1st Qu.:0.254     1st Qu.:0.253    
 Median :0.302   Median :0.304    Median :0.297     Median :0.302    
 Mean   :0.296   Mean   :0.297    Mean   :0.290     Mean   :0.296    
 3rd Qu.:0.341   3rd Qu.:0.354    3rd Qu.:0.343     3rd Qu.:0.362    
 Max.   :0.396   Max.   :0.401    Max.   :0.412     Max.   :0.401    
 stsm_35dap_to_44dap
 Min.   : 39.5      
 1st Qu.: 56.9      
 Median : 65.9      
 Mean   : 75.6      
 3rd Qu.: 83.0      
 Max.   :285.5      
# Confirm the final column names after the renaming step.
names(X)
 [1] "subject"             "wm"                  "drainage"           
 [4] "hydrol"              "cd"                  "harv.optim"         
 [7] "ph"                  "om"                  "log_sand_clay"      
[10] "log_silt_clay"       "cc35"                "rainto35dap"        
[13] "rain36to50dap"       "t2m_mean_to_4dap"    "sm_4dbp_to_3dap"    
[16] "sm_5dap_to_15dap"    "sm_17dap_to_24dap"   "sm_40dap_to_49dap"  
[19] "stsm_35dap_to_44dap"
# Save the merged data frame to DataFusion/FusedData.RData.
# Because save() stores the object by name, load() on this file restores it as `X`.
save(X, file = here::here("DataFusion", "FusedData.RData"))

Session Info

# Record the R version, platform, locale and package versions used for this run.
sessionInfo()
R version 4.4.1 (2024-06-14 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)

Matrix products: default


locale:
[1] LC_COLLATE=Portuguese_Brazil.utf8  LC_CTYPE=Portuguese_Brazil.utf8   
[3] LC_MONETARY=Portuguese_Brazil.utf8 LC_NUMERIC=C                      
[5] LC_TIME=Portuguese_Brazil.utf8    

time zone: America/Sao_Paulo
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
 [1] kableExtra_1.4.0 lubridate_1.9.3  forcats_1.0.0    stringr_1.5.1   
 [5] dplyr_1.1.4      purrr_1.0.2      readr_2.1.5      tidyr_1.3.1     
 [9] tibble_3.2.1     ggplot2_3.5.1    tidyverse_2.0.0  knitr_1.48      

loaded via a namespace (and not attached):
 [1] gtable_0.3.5      jsonlite_1.8.9    compiler_4.4.1    renv_1.1.2       
 [5] tidyselect_1.2.1  xml2_1.3.6        systemfonts_1.1.0 scales_1.3.0     
 [9] yaml_2.3.10       fastmap_1.2.0     here_1.0.1        R6_2.5.1         
[13] generics_0.1.3    htmlwidgets_1.6.4 rprojroot_2.0.4   munsell_0.5.1    
[17] svglite_2.1.3     pillar_1.9.0      tzdb_0.4.0        rlang_1.1.4      
[21] utf8_1.2.4        stringi_1.8.4     xfun_0.48         viridisLite_0.4.2
[25] timechange_0.3.0  cli_3.6.3         withr_3.0.2       magrittr_2.0.3   
[29] digest_0.6.37     grid_4.4.1        rstudioapi_0.17.0 hms_1.1.3        
[33] lifecycle_1.0.4   vctrs_0.6.5       evaluate_1.0.1    glue_1.8.0       
[37] fansi_1.0.6       colorspace_2.1-1  rmarkdown_2.28    tools_4.4.1      
[41] pkgconfig_2.0.3   htmltools_0.5.8.1